# cd /projects/data_commons/cw_code/
# bash zb_r.sh "2" "6" "cw_ind_3_names.R" "" ""

# Arguments supplied after the script name on the command line.
sys_args <- commandArgs(trailingOnly = TRUE)
# Batch indicator: 0 when the session is interactive, 1 otherwise.
# A plain if/else is used because the condition is a single logical value.
sys_batch <- if (interactive()) 0 else 1

print(sys_args)

# Every path below is built from the data-commons root directory.
dir_dc <- "/projects/data_commons/"
dir_proj <- dir_dc
dir_proc <- paste0(dir_proj, "cw/")
setwd(dir_proj)

library(data.table)
library(haven)

# Log the start time and the session details for the batch log.
print(paste0("Started at ", Sys.time()))
print(sessionInfo())

#===============================================================================
# We don't have a string variable for industries, so it is often hard to tell
# what an industry represents. As a first step, we represent each industry by
# the fk_naics that has the most employment within it. The text description of
# that NAICS code can then be used to approximate the name of our industry.

# Read the Stata extract and rename its industry column to `ind` (by reference).
dt_raw <- data.table(
  haven::read_dta(paste0(dir_dc, "lbdrev/lbdrev_clean_sum_fk.dta"))
)
setnames(dt_raw, old = "ch_ind12", new = "ind")

# Total employment for each (year, ind, fk_naics) cell; cells whose `ind` is
# missing are discarded after aggregation.
dt_sum <- dt_raw[, .(emp = sum(emp)), by = .(year, ind, fk_naics)]
dt_sum <- dt_sum[!is.na(ind)]

# Within each (year, ind): the largest cell employment and the number of
# fk_naics cells.
dt_sum[, `:=`(emp_max = max(emp), nfk = .N), by = .(year, ind)]
# Within each ind: the largest yearly number of fk_naics cells.
dt_sum[, nfk_max := max(nfk), by = .(ind)]

# Keep the cell(s) whose employment equals the (year, ind) maximum. Ties keep
# every matching row; rows where the comparison is NA are not selected.
dt_out <- dt_sum[emp == emp_max]

# Return the most frequent value of `v`.
# Ties are resolved in favour of the value that appears first in `v`.
# NA is counted like any other value; an empty `v` yields NA of v's type.
f_mode <- function(v) {
  distinct_vals <- unique(v)
  # Position of every element of `v` within the distinct values.
  positions <- match(v, distinct_vals)
  # Occurrence count for each distinct value, in first-appearance order.
  counts <- tabulate(positions)
  distinct_vals[which.max(counts)]
}

# Collapse to one row per industry: among the top-employment rows kept in
# dt_out, take the most frequent fk_naics (f_mode resolves ties by first
# occurrence) and store it as `ind_naics`.
dt_out <- dt_out[, .(ind_naics = f_mode(fk_naics)), by = "ind"]

# Mark industries that never have more than one fk_naics in any year
# (nfk_max == 1); all other industries get uniqfk = 0.
dt_out2 <- unique(dt_sum[nfk_max == 1, .(ind)])[, uniqfk := 1]
dt_out <- merge(dt_out, dt_out2, by = "ind", all.x = TRUE)
dt_out[is.na(uniqfk), uniqfk := 0]

#===============================================================================
# Add sector info and export

# Load the industry-to-sector crosswalk and rename its industry column so it
# matches the key used in dt_out.
dt_sect <- data.table(
  haven::read_dta(paste0(dir_dc, "cw/cw_ind_sect_fk_imp.dta"))
)
setnames(dt_sect, "ch_ind", "ind")

# Join the saved sector table on `sector`, then drop the ch_ind_rev column.
# NOTE(review): `sectf`, used for ordering below, presumably comes from this
# table -- confirm.
dt_sectf <- readRDS(paste0(dir_dc, "cw/cw_ind_sect_factor.rds"))
dt_sect <- merge(dt_sectf, dt_sect, by = "sector")[, ch_ind_rev := NULL]

# Round the join key to one decimal on both sides before merging
# (presumably to avoid floating-point mismatches between the two sources).
dt_sect[, ind := round(ind, 1)]
dt_out[, ind := round(ind, 1)]
# all.y = TRUE keeps every industry in dt_out, even without a sector match.
dt_out <- merge(dt_sect, dt_out, by = "ind", all.y = TRUE)
setorder(dt_out, sectf, ind)

saveRDS(dt_out, paste0(dir_proc, "cw_ind_names.rds"))

print(paste0("Ended at ", Sys.time()))
# End of R Script